{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from diskcache import Cache\n",
    "from sweepai.config.server import CACHE_DIRECTORY\n",
    "from sweepai.core.lexical_search import get_lexical_cache_key, CustomIndex, tokenize_code\n",
    "\n",
    "# Repository checkout whose cached lexical index this notebook inspects.\n",
    "REPO_DIRECTORY = \"/tmp/aurea-crm\"\n",
    "\n",
    "lexical_index_cache = Cache(f'{CACHE_DIRECTORY}/lexical_index_cache')\n",
    "snippets_cache = Cache(f'{CACHE_DIRECTORY}/snippets_cache')\n",
    "\n",
    "lexical_cache_key = get_lexical_cache_key(REPO_DIRECTORY)\n",
    "\n",
    "# Cache.get returns None on a miss; check before unpacking so the failure is\n",
    "# explained instead of surfacing as an opaque TypeError.\n",
    "cached_snippets = snippets_cache.get(lexical_cache_key)\n",
    "index = lexical_index_cache.get(lexical_cache_key)\n",
    "if cached_snippets is None or index is None:\n",
    "    raise RuntimeError(\n",
    "        f\"No cached lexical index for {REPO_DIRECTORY!r} (key {lexical_cache_key!r}); \"\n",
    "        \"build the index before running this notebook.\"\n",
    "    )\n",
    "snippets, file_list = cached_snippets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "# Write every key of the inverted index to tmp.txt, one per line.\n",
    "with open(\"tmp.txt\", \"w\") as f:\n",
    "    for k, v in tqdm(index.inverted_index.items()):\n",
    "        print(k, file=f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import tantivy\n",
    "\n",
    "# Schema: a stored text field for the document body and a stored integer doc_id.\n",
    "schema_builder = tantivy.SchemaBuilder()\n",
    "schema_builder.add_text_field(\"body\", stored=True)\n",
    "schema_builder.add_integer_field(\"doc_id\", stored=True)\n",
    "schema = schema_builder.build()\n",
    "\n",
    "# A path-backed index expects its directory to exist already, so create it first\n",
    "# to keep this cell runnable on a fresh checkout.\n",
    "TANTIVY_INDEX_DIR = \"tantivy\"\n",
    "os.makedirs(TANTIVY_INDEX_DIR, exist_ok=True)\n",
    "\n",
    "tantivy_index = tantivy.Index(schema, path=TANTIVY_INDEX_DIR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sweepai.core.lexical_search import tokenize_code\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Tokenize the text of every snippet; all_tokens[i] belongs to snippets[i].\n",
    "all_tokens = []\n",
    "for snippet in tqdm(snippets):\n",
    "    all_tokens.append(tokenize_code(snippet.get_snippet(False, False)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "writer = tantivy_index.writer()\n",
    "\n",
    "# One document per entry of all_tokens: the tokens joined by spaces as the body,\n",
    "# and the entry's position as doc_id.\n",
    "for i, tokens in enumerate(tqdm(all_tokens)):\n",
    "    document = tantivy.Document(body=\" \".join(tokens), doc_id=i)\n",
    "    writer.add_document(document)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Commit the documents that were added to the writer in the previous cell.\n",
    "writer.commit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the index after the commit and get a searcher for the query cells.\n",
    "tantivy_index.reload()\n",
    "searcher = tantivy_index.searcher()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the raw query through tokenize_code (as the indexed bodies were), join the\n",
    "# tokens with spaces and parse the result against the \"body\" field.\n",
    "query = \"hello world\"\n",
    "query = tantivy_index.parse_query(\" \".join(tokenize_code(query)), [\"body\"])\n",
    "\n",
    "# Print the stored body of the three best hits.\n",
    "top_hits = searcher.search(query, 3).hits\n",
    "for score, address in top_hits:\n",
    "    print(searcher.doc(address)[\"body\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# top_doc is not assigned anywhere else in this notebook, so define it here as the\n",
    "# address of the best hit for the current query (hits are (score, address) pairs).\n",
    "best_hits = searcher.search(query, 1).hits\n",
    "top_doc = best_hits[0][1] if best_hits else None\n",
    "\n",
    "# Show the stored document, or nothing when the query matched no document.\n",
    "searcher.doc(top_doc) if top_doc is not None else None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the content of one cached snippet.\n",
    "# NOTE(review): 11478 is a hard-coded position, presumably a doc_id taken from an\n",
    "# earlier search result - confirm. It will fail if the cached snippets are fewer.\n",
    "snippets[11478].content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print the metadata attribute of the cached lexical index.\n",
    "# print(tantivy_index)\n",
    "print(index.metadata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "# Element type of the posting arrays. The tuple spelling (np.uint16, np.uint16)\n",
    "# looks like a pair type, but NumPy reads it as (base_dtype, new_dtype), i.e. plain\n",
    "# uint16, so name that directly.\n",
    "# NOTE: uint16 tops out at 65535; compare with the maxima computed in the later\n",
    "# max_a / max_b cell before relying on these arrays.\n",
    "POSTING_DTYPE = np.uint16\n",
    "\n",
    "# One array per token, built from that token's posting list.\n",
    "numpy_inverted_index = {\n",
    "    k: np.array(v, dtype=POSTING_DTYPE)\n",
    "    for k, v in tqdm(index.inverted_index.items())\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "\n",
    "# Size of the dict object, converted from bytes to MiB.\n",
    "# NOTE: sys.getsizeof is shallow - it measures the dict object itself and does not\n",
    "# include the key strings or the arrays the dict refers to.\n",
    "sys.getsizeof(numpy_inverted_index) / 1024 / 1024"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Total number of bytes held by the array data of all posting arrays.\n",
    "sum(posting_array.nbytes for posting_array in numpy_inverted_index.values())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "# Pickle the NumPy-backed index into tmp.pkl.\n",
    "with open(\"tmp.pkl\", \"wb\") as f:\n",
    "    pickle.Pickler(f).dump(numpy_inverted_index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "# Custom binary dump of the NumPy-backed index into tmp.txt.\n",
    "# Raw array bytes can contain 0x0A, so records cannot be delimited by newlines.\n",
    "# Each record is an ASCII header line \"<key byte length> <array byte length>\\n\"\n",
    "# followed by the encoded key bytes and then the raw array bytes.\n",
    "with open(\"tmp.txt\", \"wb\") as f:\n",
    "    for k, arr in tqdm(numpy_inverted_index.items()):\n",
    "        key_bytes = k.encode()\n",
    "        arr_bytes = arr.tobytes()\n",
    "        f.write(f\"{len(key_bytes)} {len(arr_bytes)}\\n\".encode())\n",
    "        f.write(key_bytes)\n",
    "        f.write(arr_bytes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "newly_loaded = {}\n",
    "\n",
    "# Read back the length-prefixed records written by the previous cell.\n",
    "with open(\"tmp.txt\", \"rb\") as f:\n",
    "    while True:\n",
    "        header = f.readline()\n",
    "        if not header:\n",
    "            break  # end of file\n",
    "        key_length, arr_length = map(int, header.split())\n",
    "        # Decode so the keys are str again, like the keys of numpy_inverted_index.\n",
    "        k = f.read(key_length).decode()\n",
    "        # tobytes() drops the shape; postings are unpacked as pairs elsewhere in this\n",
    "        # notebook, so restore two columns.\n",
    "        arr = np.frombuffer(f.read(arr_length), dtype=np.uint16).reshape(-1, 2)\n",
    "        newly_loaded[k] = arr\n",
    "\n",
    "# Every token must survive the round trip.\n",
    "assert newly_loaded.keys() == numpy_inverted_index.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ndarray.tofile writes the raw bytes with no header, so the result is not a file\n",
    "# in .npy format despite its name; np.save writes the real .npy format, which\n",
    "# np.load can read back with dtype and shape intact.\n",
    "# `arr` is whichever array the preceding loop cell bound last.\n",
    "np.save(\"tmp.npy\", arr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Largest first and second component over all posting pairs (0 when there are none).\n",
    "max_a = max_b = 0\n",
    "\n",
    "for k, v in tqdm(index.inverted_index.items()):\n",
    "    for a, b in v:\n",
    "        if a > max_a:\n",
    "            max_a = a\n",
    "        if b > max_b:\n",
    "            max_b = b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the two maxima found by the scan in the previous cell.\n",
    "print(max_a, max_b)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# v[0] and v[1] are the first two postings of a term, not its doc ids and\n",
    "# frequencies, and a cell only displays its last expression. Scan every\n",
    "# (doc_id, freq) pair instead and show both maxima together (0 when empty).\n",
    "(\n",
    "    max((doc_id for postings in index.inverted_index.values() for doc_id, _freq in postings), default=0),\n",
    "    max((freq for postings in index.inverted_index.values() for _doc_id, freq in postings), default=0),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "other_index = index.inverted_index\n",
    "\n",
    "# Print only the first posting list to see what the values look like; the loop\n",
    "# stops after one iteration.\n",
    "for k, v in tqdm(other_index.items()):\n",
    "    print(v) # TODO: store v as a 2 item numpy array\n",
    "    break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Write the raw inverted index to tmp.json.\n",
    "with open(\"tmp.json\", \"w\") as f:\n",
    "    f.write(json.dumps(index.inverted_index))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pyarrow as pa\n",
    "\n",
    "# Convert the dictionary to a PyArrow table.\n",
    "# from_pydict makes one column per key and requires all columns to have the same\n",
    "# length, which per-token posting lists do not. Store one row per token instead:\n",
    "# a token column and a nested postings column.\n",
    "table = pa.Table.from_pydict(\n",
    "    {\n",
    "        \"token\": list(index.inverted_index.keys()),\n",
    "        \"postings\": [[list(pair) for pair in postings] for postings in index.inverted_index.values()],\n",
    "    }\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Serialize the table to a file in the Arrow IPC file format.\n",
    "# pyarrow.ipc has no write_table function; a file writer created with new_file does the writing.\n",
    "with pa.OSFile('data.arrow', 'wb') as f:\n",
    "    with pa.ipc.new_file(f, table.schema) as arrow_writer:\n",
    "        arrow_writer.write_table(table)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Deserialize the table from a file (open_file is the reader counterpart of new_file).\n",
    "with pa.OSFile('data.arrow', 'rb') as f:\n",
    "    table = pa.ipc.open_file(f).read_all()\n",
    "# Convert the table back to a token -> postings dictionary.\n",
    "# Pairs come back as lists because Arrow has no tuple type.\n",
    "columns = table.to_pydict()\n",
    "deserialized_data = dict(zip(columns[\"token\"], columns[\"postings\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lengths of the tokens that contain no underscore, and those tokens longest-first.\n",
    "lengths = [len(k) for k in index.inverted_index if \"_\" not in k]\n",
    "sorted_keys = sorted((k for k in index.inverted_index if \"_\" not in k), key=len, reverse=True)\n",
    "\n",
    "# Posting-list length per token, and the (token, length) pairs largest-first.\n",
    "sizes = {token: len(postings) for token, postings in index.inverted_index.items()}\n",
    "sorted_sizes = sorted(sizes.items(), key=lambda item: item[1], reverse=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from itertools import accumulate\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# get most common terms\n",
    "#\n",
    "# cdf[n] is the number of postings that belong to the n largest terms. The running\n",
    "# total starts at 0 and adds every entry of sorted_sizes, including the first\n",
    "# (largest) one, which a loop starting at index 1 would leave out.\n",
    "cdf = [0, *accumulate(size for _token, size in sorted_sizes)]\n",
    "total_postings = cdf[-1]\n",
    "\n",
    "# diagram the cdf using matplotlib (first 1000 points, as a share of all postings)\n",
    "if total_postings:\n",
    "    plt.plot([n / total_postings for n in cdf][:1000])\n",
    "    plt.xlabel(\"number of most common terms\")\n",
    "    plt.ylabel(\"share of all postings\")\n",
    "    plt.show()\n",
    "else:\n",
    "    # Avoid dividing by zero when the index holds no postings.\n",
    "    print(\"inverted index has no postings; nothing to plot\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Position of the \"div\" token in sorted_sizes (None when the token is absent).\n",
    "# Match on the token alone: looking up the pair (\"div\", 2385) raises ValueError as\n",
    "# soon as the token's posting count differs from the hard-coded 2385.\n",
    "next((rank for rank, (token, _size) in enumerate(sorted_sizes) if token == \"div\"), None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): this keeps the first 700 underscore-free tokens in dict order, not\n",
    "# the 700 with the most postings (sorted_sizes has that order) - confirm which one\n",
    "# was intended.\n",
    "NUM_TOP_TOKENS = 700\n",
    "top_tokens = [k for k in index.inverted_index.keys() if \"_\" not in k][:NUM_TOP_TOKENS]\n",
    "\n",
    "# Test membership against a set: scanning the list for every key made the filter\n",
    "# cost (number of tokens) x (number of top tokens) comparisons.\n",
    "top_token_set = set(top_tokens)\n",
    "new_inverted_index = {k: v for k, v in index.inverted_index.items() if k not in top_token_set}\n",
    "\n",
    "# Number of tokens after and before the filter.\n",
    "print(len(new_inverted_index))\n",
    "print(len(index.inverted_index))\n",
    "# print(len(top_tokens))\n",
    "# print(top_tokens[:5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "\n",
    "# Total number of postings in the full index, then in the filtered one.\n",
    "for inverted in (index.inverted_index, new_inverted_index):\n",
    "    print(sum(map(len, inverted.values())))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Largest frequency and largest document id in any posting (0 when there are none).\n",
    "max_freq = max_document_index = 0\n",
    "\n",
    "for postings in index.inverted_index.values():\n",
    "    for doc_id, freq in postings:\n",
    "        max_freq = max(max_freq, freq)\n",
    "        max_document_index = max(max_document_index, doc_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the maxima computed in the previous cell as a (max_freq, max_document_index) tuple.\n",
    "max_freq, max_document_index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import msgpack\n",
    "\n",
    "# Pack the filtered index with msgpack and show how many bytes it takes.\n",
    "data = new_inverted_index\n",
    "serialized_data = msgpack.packb(data)\n",
    "serialized_size = len(serialized_data)\n",
    "serialized_size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the msgpack bytes produced in the previous cell to inverted_index.msgpack.\n",
    "with open(\"inverted_index.msgpack\", \"wb\") as f:\n",
    "    f.write(serialized_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the msgpack file back into memory as bytes.\n",
    "with open(\"inverted_index.msgpack\", \"rb\") as f:\n",
    "    message = f.read()\n",
    "\n",
    "# Deserialization\n",
    "# NOTE: this rebinds deserialized_data, which the Arrow cell above also assigns.\n",
    "deserialized_data = msgpack.unpackb(message)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "# Pickle the filtered index to an in-memory bytes object.\n",
    "pickled_string = pickle.dumps(new_inverted_index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# histogram of the key lengths stored in `lengths`\n",
    "# (as last assigned above: only the index.inverted_index keys without an underscore)\n",
    "# use matplotlib\n",
    "\n",
    "from matplotlib import pyplot as plt\n",
    "\n",
    "# 20 bins over the length range 0-100\n",
    "plt.hist(lengths, bins=20, range=(0, 100))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print the tokens that are exactly 20 characters long.\n",
    "keys = list(filter(lambda token: len(token) == 20, index.inverted_index.keys()))\n",
    "print(keys)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
